/*===========================================================================
FILE NAME: In_Text_Numbers.do
CREATED: 3 November 2023
==============================================================================*/

// In-text numbers for page 15 (57 and 54 findings summary)
// Each round & program analyzed individually

//=== Round 2, CAA ===
import excel "$raw_data/Full_Download_of_SRF_Findings_Nov_3_2023.xlsx", first clear
save "$processed_data/Full_Download_of_SRF_Findings_Nov_3_2023.dta", replace

use "$processed_data/Full_Download_of_SRF_Findings_Nov_3_2023.dta", clear
keep if Round=="Round 2"
keep if Agency=="State"
keep if Media=="CAA"
unique State
tab State
gen deficient=0
replace deficient=1 if FindingLevel=="Area for Attention" | FindingLevel=="Area for Improvement"
sort State
collapse (count) ct_findings=deficient (sum) deficient, by(State)
label var ct_findings "# of findings"
label var deficient "# of findings with Area for Attention or Area for Improvement"
gen percent_deficient=100*(deficient/ct_findings)
sum ct_findings percent_deficient, detail
list ct_findings if State=="TX"
list percent_deficient if State=="TX"

//=== Round 3, CAA ===
use "$processed_data/Full_Download_of_SRF_Findings_Nov_3_2023.dta", clear
keep if Round=="Round 3"
keep if Agency=="State"
keep if Media=="CAA"
unique State
tab State
gen deficient=0
replace deficient=1 if FindingLevel=="Area for Attention" | FindingLevel=="Area for Improvement"
sort State
collapse (count) ct_findings=deficient (sum) deficient, by(State)
label var ct_findings "# of findings"
label var deficient "# of findings with Area for Attention or Area for Improvement"
gen percent_deficient=100*(deficient/ct_findings)
sum ct_findings percent_deficient, detail
list ct_findings if State=="TX"
list percent_deficient if State=="TX"

//=== Sample Characteristics for in-text discussion (page 16) ===
use "$processed_data/Air_Panel.dta", clear
drop if never_air_inv == 1     // Exclude entities never investigated

* Create panel time variable
egen t = group(year month) 
xtset RN_id t

* Generate difference variables for air investigations
forv h = 0/12 {
	gen p_air_inv_`h' = f`h'.p_air_inv - l1.p_air_inv
}
forv h = 2/12 {
	gen p_air_inv_neg`h' = l`h'.p_air_inv - l1.p_air_inv
}

egen RN_year = group(RN year)

unique RN_id
qui reghdfe p_air_inv_0  p_air_incident, absorb(RN_year t) cluster(RN_id)
keep if e(sample)
keep RN_id
duplicates drop
sort RN_id
save "$processed_data/temp.dta", replace

use "$processed_data/Air_Panel.dta"
drop if never_air_inv == 1
egen t = group(year month) 
xtset RN_id t
egen RN_year = group(RN year)

merge m:1 RN_id using "$processed_data/temp.dta"
keep if _merge==3
drop _merge
save "$processed_data/Air_est_sample_NSR_TitleV_investigated.dta", replace
unique RN_id
sum p_air_incident p_air_inv p_air_complaint_inv p_air_nocomplaint_inv p_air_nov p_air_noe p_ee_incident other_air_complaint_region
unique RN_id if county_missing==1
unique RN_id if county_missing==.
unique RN_id if TitleV==1 & NSR==1 //2280/23100= ~10%
unique RN_id if TitleV==1 & NSR==0 //116/23100= ~.5%
unique RN_id if TitleV==0 & NSR==1 //20704/23100= ~90%
keep RN_id air_incident
sort RN_id
collapse (sum) air_incident, by(RN_id)
tab air_incident

use "$processed_data/Air_est_sample_NSR_TitleV_investigated.dta", clear
sort RN_id
collapse (sum) p_air_incident p_air_inv p_air_complaint_inv p_air_nocomplaint_inv p_air_nov p_air_noe p_ee_incident, by(RN_id) 
sum p_air_incident p_air_inv p_air_complaint_inv p_air_nocomplaint_inv p_air_nov p_air_noe p_ee_incident

//=== Panel setup for sample ===
use "$processed_data/Air_Panel.dta", clear
drop if never_air_inv == 1
egen t = group(year month) 
xtset RN_id t

* Generate difference variables
forv h = 0/12 {
	gen p_air_inv_`h' = f`h'.p_air_inv - l1.p_air_inv
}
forv h = 2/12 {
	gen p_air_inv_neg`h' = l`h'.p_air_inv - l1.p_air_inv
}
egen RN_year = group(RN year)

unique RN_id
qui reghdfe p_air_inv_0  p_air_incident, absorb(RN_year t) cluster(RN_id)
keep if e(sample)
keep RN_id
duplicates drop
sort RN_id
save "$processed_data/temp.dta", replace

use "$processed_data/Air_Panel.dta", clear
drop if never_air_inv == 1
egen t = group(year month) 
xtset RN_id t
egen RN_year = group(RN year)
merge m:1 RN_id using "$processed_data/temp.dta"
keep if _merge==3
drop _merge
keep RN_id NSR TitleV
duplicates drop
sort RN_id
gen est_sample=1
unique RN_id
save "temp.dta", replace

use "$processed_data/Air_Panel.dta", clear
drop if never_air_inv == 1
egen t = group(year month) 
xtset RN_id t
egen RN_year = group(RN year)
sort RN_id
keep RN_id
duplicates drop
save "$processed_data/temp.dta", replace

//=== Footnote 23: 13% assigned to multiple industries ===
use "$processed_data/Air_Panel.dta", clear
drop if never_air_inv == 1
egen t = group(year month) 
xtset RN_id t
egen RN_year = group(RN year)
unique RN_id if missing_industry==0
unique RN_id if missing_industry==1
unique RN_id if single_industry==0
unique RN_id if single_industry==1
unique RN_id if single_industry==. // 2107/16356=13%

//=== In-text numbers and footnote 24 ===
use "$processed_data/incidents.dta", clear
drop if IncidentStatus=="REFERRED"
gen RN_id=substr(RegulatedEntity,3,.)
label var RN_id "same as RN without 'RN'"
destring RN_id, replace
rename ComplaintIncident CIN
gen temp = date(IncidentRecDate,"MDY")
drop IncidentRecDate 
rename temp IncidentRecDate
format IncidentRecDate %td
replace IncidentRecDate = . if IncidentRecDate < 0
gen year=year(IncidentRecDate)
gen month=month(IncidentRecDate)
gen mdate=ym(year,month)
format mdate %tm
// Restrict to 2003 to 2019
keep if mdate>=tm(2003m1) & mdate<=tm(2019m12)
keep RN_id CIN
duplicates drop
// CIN uniquely identifies observations at this point
sort RN_id
merge m:1 RN_id using "temp.dta"
/* Merge results: 21.5% of incidents linked to TitleV/NSR sample (_merge==3)
calculated as 23,775/110,745
83% of sample never receives a complaint (_merge==2)
calculated as 19,068/23,100. Thus, 17% receive a complaint (page 16) */
unique RN_id if _merge==3
/*4032 RN_id's in sample linked to complaints; total 23,775 complaints*/
keep if _merge==3
drop _merge
* Each RN_id may appear multiple times (one per complaint)
* Count complaints BEFORE collapsing to one row per RN_id
bysort RN_id: gen num_complaints = _N

* Collapse to one row per RN_id
duplicates drop RN_id CIN, force
bysort RN_id (CIN): keep if _n == 1

* Now each RN_id has only one row and correct complaint count

use "$processed_data/incidents.dta", clear
drop if IncidentStatus=="REFERRED"
gen RN_id=substr(RegulatedEntity,3,.)
label var RN_id "same as RN without 'RN'"
destring RN_id, replace
rename ComplaintIncident CIN
gen temp = date(IncidentRecDate,"MDY")
drop IncidentRecDate 
rename temp IncidentRecDate
format IncidentRecDate %td
replace IncidentRecDate = . if IncidentRecDate < 0
gen year=year(IncidentRecDate)
gen month=month(IncidentRecDate)
gen mdate=ym(year,month)
format mdate %tm
// Restrict to 2003 to 2019
keep if mdate>=tm(2003m1) & mdate<=tm(2019m12)
keep if Media=="AIR"
keep RN_id CIN
duplicates drop
// CIN
sort RN_id
merge m:1 RN_id using "temp.dta"
unique RN_id if _merge==3
/*3506 RN_id's in sample linked to complaints; total 15,117 complaints*/
keep if _merge==3
drop _merge
bysort RN_id: gen num_complaints = _N

* Collapse to one row per RN_id
duplicates drop RN_id CIN, force
bysort RN_id (CIN): keep if _n == 1
tabulate num_complaints
summarize num_complaints

* Count RN_id's with more than one complaint
count if num_complaints > 1
local more_than_one = r(N)

* Total RN_id's with at least one complaint
count
local with_complaint = r(N)

* Calculate percent (46% in text, pg 16)
display "Percent of facilities named in more than one non-referred air complaint incident: " 100*(`more_than_one'/`with_complaint')

* Additional percentiles for robustness, noted in footnote 24
count if num_complaints >= 5
local fiveplus = r(N)
display "Percent with 5+ complaints: " 100 * (`fiveplus' / `with_complaint')

count if num_complaints >= 10
local tenplus = r(N)
display "Percent with 10+ complaints: " 100 * (`tenplus' / `with_complaint')

summarize num_complaints
display "Maximum number of complaints for a single facility: " r(max)
// In-text percents below 46% stem from Table A1

//=== Last in-text numbers, page 26 ===

//=== Prepare activity codes data ===
use "$processed_data/ActivityCodes_clean.dta", clear
keep ActivityCode onsite_inv
// Per TCEQ email (7.22.22): "Generally speaking, UML03 are in-house, with only a few exceptions that are conducted on-site." So set onsite=0 if code1=="UML3".
replace ActivityCode=strtrim(ActivityCode)
replace onsite_inv=0 if ActivityCode=="UML3"
save "$processed_data/act_codes_temp.dta", replace

//=== Prepare investigator data ===
use "$raw_data/All Investigations 9-1-2002 – 8-31-2006.dta", clear
append using "$raw_data/All Investigations 9-1-2006 – 8-31-2010.dta"
append using "$raw_data/All Investigations 9-1-2010 – 8-31-2014"
append using "$raw_data/All Investigations 9-1-2014 – 8-31-2018.dta"
append using "$raw_data/All Investigations 9-1-2018 – 8-31-2023.dta"
gen RN_id=substr(RNNUMBER,3,.)
destring RN_id, replace
gen year=year(INVSTART_DATE)
// Keep only air investigations
keep if MEDIALIST=="AIR"
// Restrict to sample period
keep if year>2002 & year<2020
sort RN_id
merge m:1 RN_id using "$processed_data/temp.dta"
unique RN_id if _merge==1
unique RN_id if _merge==2
unique RN_id if _merge==3
keep if _merge==3
drop _merge
// Characterize investigation as onsite using activity codes (Figure A7 of paper)
split ACTIVITYCODES, gen(act_code) parse(,)
forvalues x = 1/6{
	replace act_code`x'=strtrim(act_code`x')
	sort act_code`x'
	rename act_code`x' ActivityCode
	merge m:1 ActivityCode using "$processed_data/act_codes_temp.dta"
	drop if _merge==2
	drop _merge
	rename ActivityCode act_code`x'
	rename onsite_inv onsite_inv`x'
}
egen onsite=rmax(onsite_inv1 onsite_inv2 onsite_inv3 onsite_inv4 onsite_inv5 onsite_inv6)
label var onsite "=1 if any activity codes linked to IN are mostly onsite; =0 if all non-missing values are zeroes"
// According to OCE Workplan Activities.xlsx, 3 activity codes relate to "air" or "all" complaints; explore investigation type
tab INVESTIGATIONTYPE if ACTIVITYCODES=="AIRCOMPL"
tab INVESTIGATIONTYPE if ACTIVITYCODES=="AFOPOULCMP"
tab INVESTIGATIONTYPE if ACTIVITYCODES=="APOCMPL"
gen complaint_inv=regexm(ACTIVITYCODES,"AIRCOMPL")
gen afocomplaint_inv=regexm(ACTIVITYCODES,"AFOPOULCMP")
replace complaint_inv=1 if afocomplaint_inv==1
drop afocomplaint_inv
egen total_inv_hours=rowtotal(PREINVESTIGATIONHOURS INVESTIGATIONHOURS POSTINVESTIGATIONHOURS TRAVELHOURS TRAININGHOURS MENTORINGHOURS QAHOURS)
sort INVESTIGATIONNUMBER
collapse (first) RN_id complaint_inv onsite (sum) PREINVESTIGATIONHOURS INVESTIGATIONHOURS POSTINVESTIGATIONHOURS TRAVELHOURS TRAININGHOURS MENTORINGHOURS QAHOURS total_inv_hours, by(INVESTIGATIONNUMBER)
// At this point, IN uniquely identifies observations

//=== Summary stats for investigation hours by type ===
tabstat PREINVESTIGATIONHOURS INVESTIGATIONHOURS POSTINVESTIGATIONHOURS TRAVELHOURS TRAININGHOURS MENTORINGHOURS QAHOURS total_inv_hours, statistics(mean sd median n) by(complaint_inv)
tabstat PREINVESTIGATIONHOURS INVESTIGATIONHOURS POSTINVESTIGATIONHOURS TRAVELHOURS TRAININGHOURS MENTORINGHOURS QAHOURS total_inv_hours if onsite==1, statistics(mean sd median n) by(complaint_inv)
// Offsite hours (non-complaint investigations)
tabstat PREINVESTIGATIONHOURS INVESTIGATIONHOURS POSTINVESTIGATIONHOURS TRAVELHOURS TRAININGHOURS MENTORINGHOURS QAHOURS total_inv_hours if onsite==0, statistics(mean sd median n)
// Investigations with unknown onsite/offsite status
tabstat PREINVESTIGATIONHOURS INVESTIGATIONHOURS POSTINVESTIGATIONHOURS TRAVELHOURS TRAININGHOURS MENTORINGHOURS QAHOURS total_inv_hours if onsite==., statistics(mean sd median n)

tabstat PREINVESTIGATIONHOURS INVESTIGATIONHOURS POSTINVESTIGATIONHOURS TRAVELHOURS TRAININGHOURS MENTORINGHOURS QAHOURS total_inv_hours if complaint_inv==1, statistics(mean sd median n) by(onsite) //10.5 median hours

gen totalinvestigationhours=PREINVESTIGATIONHOURS+INVESTIGATIONHOURS+POSTINVESTIGATIONHOURS+TRAVELHOURS
tabstat totalinvestigationhours if onsite==1, statistics(mean sd median n) by(complaint_inv) //7 median hours

//=== Clean up temp files ===
erase "$processed_data/act_codes_temp.dta"
erase "temp.dta"
erase "$processed_data/temp.dta"